
libname ST "E:\StockTwits"; run;   								**** Where I want to store main files ****;

libname Echo "C:\Users\jengelberg\Dropbox\Echo"; run;         	**** Where I want to share main files ****;



*****************************************************************************************************************
*****************************************************************************************************************

1. Read in Message Dataset

*****************************************************************************************************************
*****************************************************************************************************************;

filename FT77F004 "E:\StockTwits\Legacy\stocktwits_legacy_messages*";


Data ST.Master;
INFILE FT77F004 dsd DLM='*' dsd lrecl=12336;
informat var $12336.;
input var $ @@;

Format UserID 12. Message $150. FollowersCount 12. FollowingCount 12. FollowingStocksCount 12. StatusesCount 12.
	   Approach $14. Experience $14.
	   MessageID 12. SymbolID 12. SymbolText $412. Num_Symbols 12. Symbol_Name $36. Symbol_Ticker $8.
       Bear 12. Bull 12. Date Date8. Time Time8. Part1 $412. Text1 $12336.  Text2 $12336.;

If index(var, '{"verb":"post","id":"tag:firehose.stocktwits.com:note/') ne 0 and 
index(var, '{"id":"person:stocktwits:') ne 0 and
index(var, '{"id":"note:stocktwits:') ne 0 and
index(var, ',"postedTime":"') ne 0 

/*
and index(var, '"symbols":[{"displayName":') ne 0 
and index(var, '"sentiment":{"basic":"') ne 0
*/

then do;

Num1 = index(var, '{"verb":"post","id":"tag:firehose.stocktwits.com:note/');
Num2 = index(var, '{"id":"person:stocktwits:');
Num3 = index(var, '{"id":"note:stocktwits:');
Num4 = index(var, ',"postedTime":"');
Num5 = index(var, '"symbols":[{"displayName":');
Num6 = index(var, '"sentiment":{"basic":"');

Num7 = index(var, '","body":"') + 10;
Num8 = index(var, '","actor":{') - Num7;

Num9 = Num5 + 26;
Num10 = Num6 - Num9;



MessageID = Input( scan( substr(var, Num1), 9, '/"'), 12.);

UserID = Input( scan( substr(var, Num2), 5, ':"'), 12.);

Text1 = substr(var, Num2);

Num2a = index(Text1, '"followersCount":');
Num2b = index(Text1, '"followingCount":');
Num2c = index(Text1, 'followingStocksCount":');
Num2d = index(Text1, '"statusesCount":');
Num2e = index(Text1, '"approach":');
Num2f = index(Text1, '"experience":');

FollowersCount = Input( scan( substr(Text1, Num2a), 2, ':",'), 12.);
FollowingCount = Input( scan( substr(Text1, Num2b), 2, ':",'), 12.);
FollowingStocksCount = Input( scan( substr(Text1, Num2c), 2, ':",'), 12.);
StatusesCount = Input( scan( substr(Text1, Num2d), 2, ':",'), 12.);
Approach = compress( trim(left(scan( substr(Text1, Num2e), 2, ':,'))), '"}');
Experience = compress( trim(left(scan( substr(Text1, Num2f), 2, ':,'))), '"}');

Message = trim(left(substr(var, Num7, Num8)));

MessageID2 = Input( scan( substr(var, Num3), 5, ':"'), 12.);

Part1 = compress( scan( substr(var, Num4), 4, '"'), '"');

Year = Input( scan(Part1, 1, '-TZ:+'), 12.);
Month = Input( scan(Part1, 2, '-TZ:+'), 12.);
Day = Input( scan(Part1, 3, '-TZ:+'), 12.);

Hour = Input( scan(Part1, 4, '-TZ:+'), 12.);
Minute = Input( scan(Part1, 5, '-TZ:+'), 12.);
Second = Input( scan(Part1, 6, '-TZ:+'), 12.);

Date = MDY(Month, Day, Year);

Time = HMS(Hour, Minute, Second);


Text2 = substr(var, Num5);

Num5a = index(Text2, '"stocktwits_id":');

SymbolID = Input( scan( substr(Text2, Num5a), 2, ':",'), 12.);

SymbolText = trim(left( substr(var, Num9, Num10)  ));

Symbol_Name = trim(left( compress( scan(SymbolText, 1, ','), '"') ));


Num11 = index(SymbolText, ',"symbol":') + 10;
Text3 = substr(SymbolText, Num11);

Symbol_Ticker = trim(left( compress( scan(Text3, 1, ','), '"') ));

Num_Symbols = Count(SymbolText, '"stocktwits_id":');

Sent = scan( substr(var, Num6), 4, '":,');

If trim(left(Sent)) = "Bearish" then Bear = 1;
If trim(left(Sent)) = "Bullish" then Bull = 1;

output;

end;

Keep  Message UserID FollowersCount FollowingCount FollowingStocksCount StatusesCount
	   Approach Experience Num_Symbols SymbolText Symbol_Name Symbol_Ticker
	   MessageID SymbolID
       Bear Bull Date Time;
run;



*****************************************************************************************************************
*****************************************************************************************************************

2. Have a smaller master dataset that just has key variables, not the messages

*****************************************************************************************************************
*****************************************************************************************************************;

Data ST.Master_Small;
Set ST.Master;

Keep UserID Approach Experience Num_Symbols MessageID SymbolID Symbol_Name Symbol_Ticker Bear Bull Date;
run;



*****************************************************************************************************************
*****************************************************************************************************************

3. Take Master_Small, apply our filters and aggregate to the (symbol, user, day) level

*****************************************************************************************************************
*****************************************************************************************************************;

Data Temp1;
Set ST.Master_Small;

If Date >= MDY(1, 1, 2013);  			*** Start with Jan 1 2013 ;

If Bear ne . or Bull ne .;   			*** Only consider sentiment messages;

If SymbolID ne . and Num_Symbols=1;		*** Only consider messages about a single symbol;

Keep Date Bull Bear UserID SymbolID;
run;

Proc Sort Data=Temp1; by UserID SymbolID Date; run;


***** Calculate how many Bull and Bear messages per user each day ****;
Proc Univariate Data=Temp1 noprint;
by UserID SymbolID Date;
Var Bull Bear;
output out=Temp1 sum=Bull Bear; run;


**** Delete Users where we detect ever 1000+ messages per day and stocks with less than 2000 User-days ****;
Data Robots;
Set Temp1;

If Bull = . then Bull = 0;
If Bear = . then Bear = 0;

Total_messages = Bull + Bear;

run;

Data Tiny_Stocks;
Set Temp1;

Message = 1;

run;

Proc Sort Data=Robots; by UserID Date; run;

Proc Univariate Data=Robots noprint;
by UserID Date;
Var Total_messages;
output out=Robots sum=Sum_Total_messages; run;

Data Robots;
Set Robots;

If Sum_Total_messages >= 1000;

Robot = 1;

Keep UserID Robot;
run;

Proc Sort Data=Robots nodupkey; by UserID; run;

Proc Sort Data=Tiny_Stocks; by SymbolID; run;

Proc Univariate Data=Tiny_Stocks noprint;
by SymbolID;
Var Message;
output out=Tiny_Stocks sum=Sum_Total_messages; run;

Data Tiny_Stocks;
Set Tiny_Stocks;

If Sum_Total_messages <= 2000;

Tiny_Stock = 1;

Keep SymbolID Tiny_Stock;
run;

Proc Sort Data=Tiny_Stocks nodupkey; by SymbolID; run;

Proc Sql;
Create Table Temp1 AS
SELECT *
FROM Temp1 a LEFT JOIN Robots b
ON (a.UserID = b.UserID);
QUIT; RUN;

Proc Sql;
Create Table Temp1 AS
SELECT *
FROM Temp1 a LEFT JOIN Tiny_Stocks b
ON (a.SymbolID = b.SymbolID);
QUIT; RUN;

Data Temp1;
Set Temp1;

If Robot = 1 then delete;

If Tiny_Stock = 1 then delete;

run;

***** Get rid of schizophrenic users/observations and maybe bots/not-real users ****;
Data Temp1;
Set Temp1;

Bull_Count = Bull;
If Bull_Count = . then Bull_Count = 0;

Bear_Count = Bear;
If Bear_Count = . then Bear_Count = 0;

Share_Bull = Bull_Count/(Bull_Count + Bear_Count);
Share_Bear = Bear_Count/(Bull_Count + Bear_Count);

If Share_Bull >= .9 then Final_Bull = 1;
If Share_Bear >= .9 then Final_Bear = 1;

If Final_Bull = 1 or Final_Bear = 1;

Drop Bull Bear Bull_Count Bear_Count Robot Tiny_Stock Share_Bull Share_Bear;
run;

Data Temp1;
Set Temp1;

Bull = Final_Bull;
Bear = Final_Bear;

Drop Final_Bull Final_Bear;
run;

Data ST.USD_Panel;
Set Temp1; run;    ********** This is the final output: a panel dataset at the user-stock-day level ****;



*****************************************************************************************************************
*****************************************************************************************************************

4. Now take that Panel dataset and make a copy of it....we'll use this when matching/merging users using following data
	Also have a copy version at the message (rather than user-day-symbol) level

*****************************************************************************************************************
*****************************************************************************************************************;


***** Prep the follower messages file ****;
Data Temp1B;
Set ST.USD_Panel;
Format Date2 Date8.;

UserID2 = UserID;

Date2 = Date;

Bear2 = Bear;

Bull2 = Bull;

SymbolID2 = SymbolID;

Keep Date2 UserID2 Bear2 Bull2 SymbolID2;
run;



***** Temp1B is at the user-day-stock level, Also make a file which is at the user-day-stock-message level ****;

Data Temp1c;
Set ST.Master_Small;

If Date >= MDY(1, 1, 2013);  			*** Start with Jan 1 2013 ;

If Bear ne . or Bull ne .;   			*** Only consider sentiment messages;

If SymbolID ne . and Num_Symbols=1;		*** Only consider messages about a single symbol;

Keep Date Bull Bear UserID SymbolID MessageID;
run;

Proc Sql;
Create Table Temp1C AS
SELECT *
FROM Temp1C a LEFT JOIN Robots b
ON (a.UserID = b.UserID);
QUIT; RUN;

Proc Sql;
Create Table Temp1C AS
SELECT *
FROM Temp1C a LEFT JOIN Tiny_Stocks b
ON (a.SymbolID = b.SymbolID);
QUIT; RUN;

Data Temp1C;
Set Temp1C;

If Robot = 1 then delete;

If Tiny_Stock = 1 then delete;

run;

Data Temp1C;
Set Temp1C;
Format Date2 Date8.;

UserID2 = UserID;

Date2 = Date;

Bear2 = Bear;

Bull2 = Bull;

SymbolID2 = SymbolID;

MessageID2 = MessageID;

Keep Date2 UserID2 Bear2 Bull2 SymbolID2 MessageID2;
run;






*****************************************************************************************************************
*****************************************************************************************************************

5. Create three datasets: (1) Friendship creation (2) Friendship destruction and (3) Like creation

*****************************************************************************************************************
*****************************************************************************************************************;


filename FT77F001 "E:\StockTwits\Activity_Files\stocktwits_activity*";

**** Likes Create Data ****;
Data ST.Likes_Create;
INFILE FT77F001 dsd DLM='*' dsd dlmstr='}';
informat var $2056.;
input var $ @@;

Format Object $12. ID 12. MessageID 12. UserID 12. LikedUserID 12. Date Date8. Time Time8. Lag1var $2056.;

Lag1var = lag1(var);

Part1 = Scan(Lag1var, 1, ":");
Part2 = Scan(Lag1var, 2, ":");
Part3 = Scan(Lag1var, 3, ":");
Part4 = Scan(Lag1var, 4, ":");
Part5 = Scan(Lag1var, 5, ":");
Part6 = Scan(Lag1var, 6, ":");
Part7 = Scan(Lag1var, 7, ":");
Part8 = Scan(Lag1var, 8, ":");
Part9 = Scan(Lag1var, 9, ":");
Part10 = Scan(Lag1var, 10, ":");
Part11 = Scan(Lag1var, 11, ":");

If index(Lag1var, '"object":"LikeMessage"') ne 0 and index(Part3, "create") ne 0 then do;

Object = trim(left( compress(scan(Part2, 1, ","), '"') ));
ID = Input( scan(Part5, 1, ","), 12.);
MessageID = Input( scan(Part6, 1, ","), 12.);
UserID = Input( scan(Part7, 1, ","), 12.);
LikedUserID = Input( scan(Part8, 1, ","), 12.);

Year = Input( scan(Part9, 1, '"-TZ'), 12.);
Month = Input( scan(Part9, 2, '"-TZ'), 12.);
Day = Input( scan(Part9, 3, '"-TZ'), 12.);

Hour = Input( scan(Part9, 4, '"-TZ'), 12.);
Minute = Input( scan(Part10, 1, '"-TZ'), 12.);
Second = Input( scan(Part11, 1, '"-TZ'), 12.);

Date = MDY(Month, Day, Year);

Time = HMS(Hour, Minute, Second);

output;

end;

Keep ID MessageID UserID LikedUserID Date Time;

run;

**** Friends Create Data ****;
Data ST.Friends_Create;
INFILE FT77F001 dsd DLM='*' dsd dlmstr='}';
informat var $2056.;
input var $ @@;

Format Object $12. ID 12. UserID 12. FollowingUserID 12. Date Date8. Time Time8. Lag1var $2056.;

Lag1var = lag1(var);

Part1 = Scan(Lag1var, 1, ":");
Part2 = Scan(Lag1var, 2, ":");
Part3 = Scan(Lag1var, 3, ":");
Part4 = Scan(Lag1var, 4, ":");
Part5 = Scan(Lag1var, 5, ":");
Part6 = Scan(Lag1var, 6, ":");
Part7 = Scan(Lag1var, 7, ":");
Part8 = Scan(Lag1var, 8, ":");
Part9 = Scan(Lag1var, 9, ":");
Part10 = Scan(Lag1var, 10, ":");
Part11 = Scan(Lag1var, 11, ":");


If index(Lag1var, '"object":"Friendship"') and index(Part3, "create") ne 0 then do;

Object = trim(left( compress(scan(Part2, 1, ","), '"') ));
ID = Input( scan(Part5, 1, ","), 12.);
UserID = Input( scan(Part6, 1, ","), 12.);
FollowingUserID = Input( scan(Part7, 1, ","), 12.);

Year = Input( scan(Part8, 1, '"-TZ'), 12.);
Month = Input( scan(Part8, 2, '"-TZ'), 12.);
Day = Input( scan(Part8, 3, '"-TZ'), 12.);

Hour = Input( scan(Part8, 4, '"-TZ'), 12.);
Minute = Input( scan(Part9, 1, '"-TZ'), 12.);
Second = Input( scan(Part10, 1, '"-TZ'), 12.);

Date = MDY(Month, Day, Year);

Time = HMS(Hour, Minute, Second);

output;

end;

Keep ID UserID FollowingUserID Date Time;

run;

**** Friends Destroy Data ****;
Data ST.Friends_Destroy;
INFILE FT77F001 dsd DLM='*' dsd dlmstr='}';
informat var $2056.;
input var $ @@;

Format Object $12. ID 12. UserID 12. FollowingUserID 12. Date Date8. Time Time8.
Lag1var $2056. Date_Destroyed Date8. Time_Destroyed Time8.;

Lag1var = lag1(var);

Part1 = Scan(Lag1var, 1, ":");
Part2 = Scan(Lag1var, 2, ":");
Part3 = Scan(Lag1var, 3, ":");
Part4 = Scan(Lag1var, 4, ":");
Part5 = Scan(Lag1var, 5, ":");
Part6 = Scan(Lag1var, 6, ":");
Part7 = Scan(Lag1var, 7, ":");
Part8 = Scan(Lag1var, 8, ":");
Part9 = Scan(Lag1var, 9, ":");
Part10 = Scan(Lag1var, 10, ":");
Part11 = Scan(Lag1var, 11, ":");

If index(Lag1var, '"object":"Friendship"') and index(Part3, "destroy") ne 0 then do;

Object = trim(left( compress(scan(Part2, 1, ","), '"') ));
ID = Input( scan(Part5, 1, ","), 12.);
UserID = Input( scan(Part6, 1, ","), 12.);
FollowingUserID = Input( scan(Part7, 1, ","), 12.);

Year = Input( scan(Part8, 1, '"-TZ'), 12.);
Month = Input( scan(Part8, 2, '"-TZ'), 12.);
Day = Input( scan(Part8, 3, '"-TZ'), 12.);

Hour = Input( scan(Part8, 4, '"-TZ'), 12.);
Minute = Input( scan(Part9, 1, '"-TZ'), 12.);
Second = Input( scan(Part10, 1, '"-TZ'), 12.);

Date = MDY(Month, Day, Year);

Time = HMS(Hour, Minute, Second);

Year2 = Input( scan(var, 3, '"-TZ:+'), 12.);
Month2 = Input( scan(var, 4, '"-TZ:+'), 12.);
Day2 = Input( scan(var, 5, '"-TZ:'), 12.);

Hour2 = Input( scan(var, 6, '"-TZ:+'), 12.);
Minute2 = Input( scan(var, 7, '"-TZ:+'), 12.);
Second2 = Input( scan(var, 8, '"-TZ:+'), 12.);

Date_Destroyed = MDY(Month2, Day2, Year2);

Time_Destroyed = HMS(Hour2, Minute2, Second2);

output;

end;

Keep ID UserID FollowingUserID Date Time Date_Destroyed Time_Destroyed;

run;




*****************************************************************************************************************
*****************************************************************************************************************

6. Now create a dataset of following: when it starts and when it ends (if ever)

*****************************************************************************************************************
*****************************************************************************************************************;

***** Start with friendship creation ****;
Data Temp2;
Set ST.Friends_create;
Keep ID UserID FollowingUserID Date;
run;

***** Get the dataset of all friendship destruction ****;
Data Temp3;
Set ST.Friends_Destroy;
Format Date2 Date8.;
Keep ID Date_Destroyed;
run;

***** Merge the Detruction Dataset onto the Creation Dataset to see when each instance ends (if ever)  ****;
Proc Sql;
Create Table Temp4 AS
SELECT *
FROM Temp2 a LEFT JOIN Temp3 b
ON (a.ID = b.ID);
QUIT; RUN;

Data Temp4;
Set Temp4;
Format Date_Created Date8.;

If Date_Destroyed = . then Date_Destroyed = MDY(1, 1, 2030);  *** Need to make a future date for the not-yet destroyeds;

Date_Created = Date;

Drop Date;
run;

************************************************************
**** Fix Errors with Created and Destroyed Timing ****;

Proc Sort Data=Temp4; by UserID FollowingUserID Date_Created Date_Destroyed; run;

Data Temp4;
Set Temp4;
Format Max1 Date8.;
Retain Max1;

Lag1UserID = lag1(UserID);
Lag1FollowingUserID = lag1(FollowingUserID);

If UserID ne Lag1UserID OR FollowingUserID ne Lag1FollowingUserID then do; Max1 = Date_Destroyed; end;

If UserID = Lag1UserID AND FollowingUserID = Lag1FollowingUserID then do; Max1 = Max(Max1, Date_Destroyed); end;

Drop Lag1UserID Lag1FollowingUserID;
run;

Data Temp4;
Set Temp4;

If Max1 > Date_Destroyed then delete;

Drop Max1;
run;

Proc Sort Data=Temp4; by UserID FollowingUserID Date_Created Date_Destroyed; run;

Data Temp4;
Set Temp4;
Format Lag1Date_Created Date8. Lag1Date_Destroyed Date8.;

Lag1Date_Created = lag1(Date_Created);
Lag1Date_Destroyed = lag1(Date_Destroyed);
Lag1UserID = lag1(UserID);
Lag1FollowingUserID = lag1(FollowingUserID);

If UserID = Lag1UserID and FollowingUserID = Lag1FollowingUserID and Date_Created < Lag1Date_Destroyed then do;

Date_Created = Lag1Date_Destroyed;

end;  *** Push the creation event to destruction event in overlap cases like this ***;

Drop Lag1Date_Created Lag1Date_Destroyed Lag1UserID Lag1FollowingUserID;
run;


Proc Sort Data=Temp4; by UserID FollowingUserID Date_Created Date_Destroyed; run;

Data Temp4;
Set Temp4;
Format Lag1Date_Created Date8. Lag1Date_Destroyed Date8.;

Lag1Date_Created = lag1(Date_Created);
Lag1Date_Destroyed = lag1(Date_Destroyed);
Lag1UserID = lag1(UserID);
Lag1FollowingUserID = lag1(FollowingUserID);

If UserID = Lag1UserID and FollowingUserID = Lag1FollowingUserID and Date_Created < Lag1Date_Destroyed then do;

Date_Created = Lag1Date_Destroyed;

end;  *** Push the creation event to destruction event in overlap cases like this ***;

Drop Lag1Date_Created Lag1Date_Destroyed Lag1UserID Lag1FollowingUserID;
run;

Proc Sort Data=Temp4; by UserID FollowingUserID Date_Created Date_Destroyed; run;

Data Temp4;
Set Temp4;
Format Lag1Date_Created Date8. Lag1Date_Destroyed Date8.;

Lag1Date_Created = lag1(Date_Created);
Lag1Date_Destroyed = lag1(Date_Destroyed);
Lag1UserID = lag1(UserID);
Lag1FollowingUserID = lag1(FollowingUserID);

If UserID = Lag1UserID and FollowingUserID = Lag1FollowingUserID and Date_Created < Lag1Date_Destroyed then do;

Date_Created = Lag1Date_Destroyed;

end;  *** Push the creation event to destruction event in overlap cases like this ***;

Drop Lag1Date_Created Lag1Date_Destroyed Lag1UserID Lag1FollowingUserID;
run;

Proc Sort Data=Temp4; by UserID FollowingUserID Date_Created Date_Destroyed; run;

Data Temp4;
Set Temp4;
Format Lag1Date_Created Date8. Lag1Date_Destroyed Date8.;

Lag1Date_Created = lag1(Date_Created);
Lag1Date_Destroyed = lag1(Date_Destroyed);
Lag1UserID = lag1(UserID);
Lag1FollowingUserID = lag1(FollowingUserID);

If UserID = Lag1UserID and FollowingUserID = Lag1FollowingUserID and Date_Created < Lag1Date_Destroyed then do;

Date_Created = Lag1Date_Destroyed;

end;  *** Push the creation event to destruction event in overlap cases like this ***;

Drop Lag1Date_Created Lag1Date_Destroyed Lag1UserID Lag1FollowingUserID;
run;

Data Temp4;
Set Temp4;

If Date_Created > Date_Destroyed then delete;

run;
**** Fix Errors with Created and Destroyed Timing ****;
************************************************************;

Data ST.Following;
Set Temp4;
run;



*****************************************************************************************************************
*****************************************************************************************************************

6. Now, using a looping structure, find number of bullish/bearish impressions over the next 50 days 
	as a function of day-0 bullish or bearish declaration

	Also, make the dataset for the regression

*****************************************************************************************************************
*****************************************************************************************************************;

**** Look over next 50 days ****;

Data Main;
Set ST.USD_Panel;
Format Date_Begin Date8. Date_End Date8.;

Date_Begin = Date + 1;
Date_End = Date + 50;

run;

Proc printto log=junk; run;

%MACRO mfmatch;

%DO b=0 %TO 99;

Data Sub&b;
Set Main;
where mod(UserID, 100) = &b;
run;


Proc Sql;
Create Table Sub&b AS

SELECT a.Date, a.SymbolID, a.UserID, c.UserID2, a.Bull, a.Bear, c.Bear2, c.Bull2, c.Date2

FROM Sub&b a LEFT JOIN ST.Following b

ON (a.UserID = b.UserID)

LEFT JOIN Temp1B c

ON (b.FollowingUserID = c.UserId2) AND (a.SymbolID = c.SymbolID2)

AND (a.Date_Begin <= c.Date2) AND (a.Date_End >= c.Date2)

AND (c.Date2 > b.Date_Created) AND (c.Date2 < b.Date_Destroyed) 

WHERE (c.Bull2 = 1 OR c.Bear2 = 1)
;
QUIT; RUN;



%End;

Data Cumulative;
Set Sub0-Sub99; run;

%End;


%MEND mfmatch;

%mfmatch ;
Run;

Proc printto; run;


Data ET1;
Set Cumulative;

Event_Time = Date2 - Date;

run;

PROC SQL;
Create Table ET2 AS
  SELECT Sum(Bull2) AS Sum_Bull, Sum(Bear2) AS Sum_Bear, Bull, Bear, Event_Time, UserID, SymbolID, Date
    FROM ET1
      GROUP BY UserID, SymbolID, Date, Bull, Bear, Event_Time;
QUIT;
Run;

***************************************;
***** BUILD Regression Dataset ****;

Data Regression;
Set ST.USD_Panel;
Format Date2 Date8.;

Do i = 1 to 50;
Event_Time = i; 
Date2 = Date + i;
output;
end;

Drop i;
run;

Proc Sql;
Create Table Regression AS
SELECT *
FROM Regression a LEFT JOIN ET2 b
ON (a.UserID = b.UserID) AND (a.SymbolID = b.SymbolID) AND (a.Date = b.Date) AND
(a.Event_Time = b.Event_Time);
QUIT; RUN;

Data ST.Reg_Info_Flow;
Set Regression;

If Sum_Bull = . then Sum_Bull = 0;
If Sum_Bear = . then Sum_Bear = 0;

If Date2 > MDY(6, 30, 2020) then delete;  **** Only keep event times that are possible ****;

run;

***************************************;
***** Corresponding Figure ****;

Proc Sort Data=ST.Reg_Info_Flow;
by Bull Bear Event_Time; run;

Proc Univariate Data=ST.Reg_Info_Flow noprint;
by Bull Bear Event_Time;
Var Sum_Bull Sum_Bear;
output out=ST.Figure4 sum=Sum_Bull Sum_Bear mean=Mean_Bull Mean_Bear N=N1 N2; run;


***************************************;
***** Build Final Regression Dataset with columns for ED 1, 2, ....10, 11-20, 21-30, 31-40, 41-50 ****;

%MACRO mfmatch;

%DO b=1 %TO 10;

Data Play&b ; Set ST.Reg_Info_Flow; 
If Event_Time = &b; Sum_Bull_&b = Sum_Bull; Sum_Bear_&b = Sum_Bear; 
Keep UserID SymbolID Date Sum_Bull_&b Sum_Bear_&b;
run;

%End;

%End;


%MEND mfmatch;

%mfmatch ;
Run;

Data Play11 ; Set ST.Reg_Info_Flow; 
If Event_Time >= 11 and Event_Time <= 20; Sum_Bull_11_20 = Sum_Bull; Sum_Bear_11_20 = Sum_Bear; 
Keep UserID SymbolID Date Sum_Bull_11_20 Sum_Bear_11_20;
run;

Proc Sort Data=Play11; by UserID SymbolID Date; run;
Proc Univariate Data=Play11 noprint; by UserID SymbolID Date;
Var Sum_Bull_11_20 Sum_Bear_11_20; output out=Play11 Sum=Sum_Bull_11_20 Sum_Bear_11_20; run;

Data Play12 ; Set ST.Reg_Info_Flow; 
If Event_Time >= 21 and Event_Time <= 30; Sum_Bull_21_30 = Sum_Bull; Sum_Bear_21_30 = Sum_Bear; 
Keep UserID SymbolID Date Sum_Bull_21_30 Sum_Bear_21_30;
run;

Proc Sort Data=Play12; by UserID SymbolID Date; run;
Proc Univariate Data=Play12 noprint; by UserID SymbolID Date;
Var Sum_Bull_21_30 Sum_Bear_21_30; output out=Play12 Sum=Sum_Bull_21_30 Sum_Bear_21_30; run;

Data Play13 ; Set ST.Reg_Info_Flow; 
If Event_Time >= 31 and Event_Time <= 40; Sum_Bull_31_40 = Sum_Bull; Sum_Bear_31_40 = Sum_Bear; 
Keep UserID SymbolID Date Sum_Bull_31_40 Sum_Bear_31_40;
run;

Proc Sort Data=Play13; by UserID SymbolID Date; run;
Proc Univariate Data=Play13 noprint; by UserID SymbolID Date;
Var Sum_Bull_31_40 Sum_Bear_31_40; output out=Play13 Sum=Sum_Bull_31_40 Sum_Bear_31_40; run;

Data Play14 ; Set ST.Reg_Info_Flow; 
If Event_Time >= 41 and Event_Time <= 50; Sum_Bull_41_50 = Sum_Bull; Sum_Bear_41_50 = Sum_Bear; 
Keep UserID SymbolID Date Sum_Bull_41_50 Sum_Bear_41_50;
run;

Proc Sort Data=Play14; by UserID SymbolID Date; run;
Proc Univariate Data=Play14 noprint; by UserID SymbolID Date;
Var Sum_Bull_41_50 Sum_Bear_41_50; output out=Play14 Sum=Sum_Bull_41_50 Sum_Bear_41_50; run;


**** Now put it altogether **;
Data Play; Set ST.Reg_Info_Flow; If Event_Time = 1;
Keep UserID SymbolID Date Bear Bull;
run;

%MACRO mfmatch;

%DO b=1 %TO 14;

Proc Sql;
Create Table Play AS SELECT *
FROM Play a LEFT JOIN Play&b b ON (a.UserID = b.UserID) and (a.SymbolID = b.SymbolID) AND (a.Date = b.Date); QUIT; RUN;

%End;

%End;

%MEND mfmatch;

%mfmatch ;
Run;

Data Echo.FW_Info_Flow;
Set Play; run;

*****************************************************************************************************************
*****************************************************************************************************************

7. Identical step as before but now with MESSAGES (rather than user-day-symbol impressions)

*****************************************************************************************************************
*****************************************************************************************************************;

Data Main;
Set ST.USD_Panel;
Format Date_Begin Date8. Date_End Date8.;

Date_Begin = Date + 1;
Date_End = Date + 50;

run;

Proc printto log=junk; run;

%MACRO mfmatch;

%DO b=0 %TO 99;

Data Sub&b;
Set Main;
where mod(UserID, 100) = &b;
run;


Proc Sql;
Create Table Sub&b AS

SELECT a.Date, a.SymbolID, a.UserID, c.UserID2, a.Bull, a.Bear, c.Bear2, c.Bull2, c.Date2, c.MessageID2

FROM Sub&b a LEFT JOIN ST.Following b

ON (a.UserID = b.UserID)

LEFT JOIN Temp1C c

ON (b.FollowingUserID = c.UserId2) AND (a.SymbolID = c.SymbolID2)

AND (a.Date_Begin <= c.Date2) AND (a.Date_End >= c.Date2)

AND (c.Date2 > b.Date_Created) AND (c.Date2 < b.Date_Destroyed) 

WHERE (c.Bull2 = 1 OR c.Bear2 = 1)
;
QUIT; RUN;

%End;

Data Cumulative;
Set Sub0-Sub99; run;

%MEND mfmatch;

%mfmatch ;
Run;

Proc printto; run;


Data ET1C;
Set Cumulative;

Event_Time = Date2 - Date;

run;

PROC SQL;
Create Table ET2C AS
  SELECT Sum(Bull2) AS Sum_Bull, Sum(Bear2) AS Sum_Bear, Bull, Bear, Event_Time, UserID, SymbolID, Date
    FROM ET1C
      GROUP BY UserID, SymbolID, Date, Bull, Bear, Event_Time;
QUIT;
Run;

***************************************;
***** BUILD Regression Dataset ****;

Data Regression;
Set ST.USD_Panel;
Format Date2 Date8.;

Do i = 1 to 50;
Event_Time = i; 
Date2 = Date + i;
output;
end;

Drop i;
run;

Proc Sql;
Create Table Regression AS
SELECT *
FROM Regression a LEFT JOIN ET2C b
ON (a.UserID = b.UserID) AND (a.SymbolID = b.SymbolID) AND (a.Date = b.Date) AND
(a.Event_Time = b.Event_Time);
QUIT; RUN;

Data ST.Reg_Info_Flow_M;
Set Regression;

If Sum_Bull = . then Sum_Bull = 0;
If Sum_Bear = . then Sum_Bear = 0;

If Date2 > MDY(6, 30, 2020) then delete;  **** Only keep event times that are possible ****;

run;

***************************************;
***** Corresponding Figure ****;

Proc Sort Data=ST.Reg_Info_Flow_M;
by Bull Bear Event_Time; run;

Proc Univariate Data=ST.Reg_Info_Flow_M noprint;
by Bull Bear Event_Time;
Var Sum_Bull Sum_Bear;
output out=ST.Figure3 sum=Sum_Bull Sum_Bear mean=Mean_Bull Mean_Bear N=N1 N2; run;



***************************************;
***** Build Final Regression Dataset with columns for ED 1, 2, ....10, 11-20, 21-30, 31-40, 41-50 ****;

%MACRO mfmatch;

%DO b=1 %TO 10;

Data Play&b ; Set ST.Reg_Info_Flow_m; 
If Event_Time = &b; Sum_Bull_&b = Sum_Bull; Sum_Bear_&b = Sum_Bear; 
Keep UserID SymbolID Date Sum_Bull_&b Sum_Bear_&b;
run;

%End;

%End;


%MEND mfmatch;

%mfmatch ;
Run;

Data Play11 ; Set ST.Reg_Info_Flow_m; 
If Event_Time >= 11 and Event_Time <= 20; Sum_Bull_11_20 = Sum_Bull; Sum_Bear_11_20 = Sum_Bear; 
Keep UserID SymbolID Date Sum_Bull_11_20 Sum_Bear_11_20;
run;

Proc Sort Data=Play11; by UserID SymbolID Date; run;
Proc Univariate Data=Play11 noprint; by UserID SymbolID Date;
Var Sum_Bull_11_20 Sum_Bear_11_20; output out=Play11 Sum=Sum_Bull_11_20 Sum_Bear_11_20; run;

Data Play12 ; Set ST.Reg_Info_Flow_m; 
If Event_Time >= 21 and Event_Time <= 30; Sum_Bull_21_30 = Sum_Bull; Sum_Bear_21_30 = Sum_Bear; 
Keep UserID SymbolID Date Sum_Bull_21_30 Sum_Bear_21_30;
run;

Proc Sort Data=Play12; by UserID SymbolID Date; run;
Proc Univariate Data=Play12 noprint; by UserID SymbolID Date;
Var Sum_Bull_21_30 Sum_Bear_21_30; output out=Play12 Sum=Sum_Bull_21_30 Sum_Bear_21_30; run;

Data Play13 ; Set ST.Reg_Info_Flow_m; 
If Event_Time >= 31 and Event_Time <= 40; Sum_Bull_31_40 = Sum_Bull; Sum_Bear_31_40 = Sum_Bear; 
Keep UserID SymbolID Date Sum_Bull_31_40 Sum_Bear_31_40;
run;

Proc Sort Data=Play13; by UserID SymbolID Date; run;
Proc Univariate Data=Play13 noprint; by UserID SymbolID Date;
Var Sum_Bull_31_40 Sum_Bear_31_40; output out=Play13 Sum=Sum_Bull_31_40 Sum_Bear_31_40; run;

Data Play14 ; Set ST.Reg_Info_Flow_m; 
If Event_Time >= 41 and Event_Time <= 50; Sum_Bull_41_50 = Sum_Bull; Sum_Bear_41_50 = Sum_Bear; 
Keep UserID SymbolID Date Sum_Bull_41_50 Sum_Bear_41_50;
run;

Proc Sort Data=Play14; by UserID SymbolID Date; run;
Proc Univariate Data=Play14 noprint; by UserID SymbolID Date;
Var Sum_Bull_41_50 Sum_Bear_41_50; output out=Play14 Sum=Sum_Bull_41_50 Sum_Bear_41_50; run;


**** Now put it altogether **;
Data Play; Set ST.Reg_Info_Flow_m; If Event_Time = 1;
Keep UserID SymbolID Date Bear Bull;
run;

%MACRO mfmatch;

%DO b=1 %TO 14;

Proc Sql;
Create Table Play AS SELECT *
FROM Play a LEFT JOIN Play&b b ON (a.UserID = b.UserID) and (a.SymbolID = b.SymbolID) AND (a.Date = b.Date); QUIT; RUN;

%End;

%End;

%MEND mfmatch;

%mfmatch ;
Run;

Data Echo.FW_Info_Flow_m;
Set Play; run;


*****************************************************************************************************************
*****************************************************************************************************************

8. Now do the new and lost follows

*****************************************************************************************************************
*****************************************************************************************************************;

*** First Do New Ones ***;
Data Main;
Set ST.USD_Panel;
Format Date_Begin Date8. Date_End Date8.;

Date_Begin = Date + 1;
Date_End = Date + 50;

run;


Proc printto log=junk; run;

%MACRO mfmatch;

%DO b=0 %TO 99;

Data Sub&b;
Set Main;
where mod(UserID, 100) = &b;
run;


Proc Sql;
Create Table Sub&b AS

SELECT a.Date, a.SymbolID, a.UserID, c.UserID2, a.Bull, a.Bear, c.Bear2, c.Bull2, c.Date2

FROM Sub&b a LEFT JOIN ST.Following b

ON (a.UserID = b.UserID)

LEFT JOIN Temp1B c

ON (a.SymbolID = c.SymbolID2) AND (a.Date_Begin <= c.Date2) AND (a.Date_End >= c.Date2)

AND (c.UserId2 = b.FollowingUserID ) AND (c.Date2 = b.Date_Created)

WHERE (a.UserID ne . AND c.SymbolID2 ne . AND b.Date_Created ne .)
;
QUIT; RUN;

%End;

Data Cumulative;
Set Sub0-Sub99; run;

%MEND mfmatch;

%mfmatch ;
Run;

Proc printto; run;

Data Cumulative;
Set Cumulative;

Event_Time = Date2 - Date;

run;

Proc Sort Data=Cumulative; by UserID SymbolID Date Bear Bull Event_Time; run;

Proc Univariate Data=Cumulative noprint;
by UserID SymbolID Date Bear Bull Event_Time;
Var Bull2 Bear2;
output out=New_Cumulative sum=New_Bulls New_Bears; run;





*** Now do the Lost Ones ***;
Data Main;
Set ST.USD_Panel;
Format Date_Begin Date8. Date_End Date8.;

Date_Begin = Date + 1;
Date_End = Date + 50;

run;


Proc printto log=junk; run;

%MACRO mfmatch;

%DO b=0 %TO 99;

Data Sub&b;
Set Main;
where mod(UserID, 100) = &b;
run;


Proc Sql;
Create Table Sub&b AS

SELECT a.Date, a.SymbolID, a.UserID, c.UserID2, a.Bull, a.Bear, c.Bear2, c.Bull2, c.Date2

FROM Sub&b a LEFT JOIN ST.Following b

ON (a.UserID = b.UserID)

LEFT JOIN Temp1B c

ON (a.SymbolID = c.SymbolID2) AND (a.Date_Begin <= c.Date2) AND (a.Date_End >= c.Date2)

AND (c.UserId2 = b.FollowingUserID ) AND (c.Date2 = b.Date_Destroyed)

WHERE (a.UserID ne . AND c.SymbolID2 ne . AND b.Date_Destroyed ne .)
;
QUIT; RUN;

%End;

Data Cumulative;
Set Sub0-Sub99; run;

%MEND mfmatch;

%mfmatch ;
Run;

Proc printto; run;

Data Cumulative;
Set Cumulative;

Event_Time = Date2 - Date;

run;

Proc Sort Data=Cumulative; by UserID SymbolID Date Bear Bull Event_Time; run;

Proc Univariate Data=Cumulative noprint;
by UserID SymbolID Date Bear Bull Event_Time;
Var Bull2 Bear2;
output out=Lost_Cumulative sum=Lost_Bulls Lost_Bears; run;


***************************************;
***** BUILD Regression Dataset ****;

Data Regression;
Set ST.USD_Panel;
Format Date2 Date8.;

Do i = 1 to 50;
Event_Time = i; 
Date2 = Date + i;
output;
end;

Drop i;
run;

Proc Sql;
Create Table Regression AS
SELECT *
FROM Regression a LEFT JOIN New_Cumulative b
ON (a.UserID = b.UserID) AND (a.SymbolID = b.SymbolID) AND (a.Date = b.Date) AND
(a.Event_Time = b.Event_Time);
QUIT; RUN;

Proc Sql;
Create Table Regression AS
SELECT *
FROM Regression a LEFT JOIN Lost_Cumulative b
ON (a.UserID = b.UserID) AND (a.SymbolID = b.SymbolID) AND (a.Date = b.Date) AND
(a.Event_Time = b.Event_Time);
QUIT; RUN;

Data ST.Reg_New_Follows;
Set Regression;

If New_Bulls = . then New_Bulls = 0;
If New_Bears = . then New_Bears = 0;

If Lost_Bulls = . then Lost_Bulls = 0;
If Lost_Bears = . then Lost_Bears = 0;

Net_Bulls = New_Bulls - Lost_Bulls;
Net_Bears = New_Bears - Lost_Bears;

If Date2 > MDY(6, 30, 2020) then delete;  **** Only keep event times that are possible ****;

run;


***************************************;
***** Corresponding Figure ****;

Proc Sort Data=ST.Reg_New_Follows;
by Bull Bear Event_Time; run;

Proc Univariate Data=ST.Reg_New_Follows noprint;
by Bull Bear Event_Time;
Var Net_Bulls Net_Bears;
output out=ST.Figure2 sum=Sum_Bull Sum_Bear mean=Mean_Bull Mean_Bear N=N1 N2; run;



***************************************;
***** Build Final Regression Dataset with columns for ED 1, 2, ....10, 11-20, 21-30, 31-40, 41-50 ****;

%MACRO mfmatch;

%DO b=1 %TO 10;

Data Play&b ; Set ST.Reg_new_follows; 
If Event_Time = &b; Net_Bulls_&b = Net_Bulls; Net_Bears_&b = Net_Bears; 
Keep UserID SymbolID Date Net_Bulls_&b Net_Bears_&b;
run;

%End;

%End;


%MEND mfmatch;

%mfmatch ;
Run;

Data Play11 ; Set ST.Reg_new_follows; 
If Event_Time >= 11 and Event_Time <= 20; Net_Bulls_11_20 = Net_Bulls; Net_Bears_11_20 = Net_Bears; 
Keep UserID SymbolID Date Net_Bulls_11_20 Net_Bears_11_20;
run;

Proc Sort Data=Play11; by UserID SymbolID Date; run;
Proc Univariate Data=Play11 noprint; by UserID SymbolID Date;
Var Net_Bulls_11_20 Net_Bears_11_20; output out=Play11 Sum=Net_Bulls_11_20 Net_Bears_11_20; run;

Data Play12 ; Set ST.Reg_new_follows; 
If Event_Time >= 21 and Event_Time <= 30; Net_Bulls_21_30 = Net_Bulls; Net_Bears_21_30 = Net_Bears; 
Keep UserID SymbolID Date Net_Bulls_21_30 Net_Bears_21_30;
run;

Proc Sort Data=Play12; by UserID SymbolID Date; run;
Proc Univariate Data=Play12 noprint; by UserID SymbolID Date;
Var Net_Bulls_21_30 Net_Bears_21_30; output out=Play12 Sum=Net_Bulls_21_30 Net_Bears_21_30; run;

Data Play13 ; Set ST.Reg_new_follows; 
If Event_Time >= 31 and Event_Time <= 40; Net_Bulls_31_40 = Net_Bulls; Net_Bears_31_40 = Net_Bears; 
Keep UserID SymbolID Date Net_Bulls_31_40 Net_Bears_31_40;
run;

Proc Sort Data=Play13; by UserID SymbolID Date; run;
Proc Univariate Data=Play13 noprint; by UserID SymbolID Date;
Var Net_Bulls_31_40 Net_Bears_31_40; output out=Play13 Sum=Net_Bulls_31_40 Net_Bears_31_40; run;

Data Play14 ; Set ST.Reg_new_follows; 
If Event_Time >= 41 and Event_Time <= 50; Net_Bulls_41_50 = Net_Bulls; Net_Bears_41_50 = Net_Bears; 
Keep UserID SymbolID Date Net_Bulls_41_50 Net_Bears_41_50;
run;

Proc Sort Data=Play14; by UserID SymbolID Date; run;
Proc Univariate Data=Play14 noprint; by UserID SymbolID Date;
Var Net_Bulls_41_50 Net_Bears_41_50; output out=Play14 Sum=Net_Bulls_41_50 Net_Bears_41_50; run;


**** Now put it altogether **;
Data Play; Set ST.Reg_new_follows; If Event_Time = 1;
Keep UserID SymbolID Date Bear Bull;
run;

%MACRO mfmatch;

%DO b=1 %TO 14;

Proc Sql;
Create Table Play AS SELECT *
FROM Play a LEFT JOIN Play&b b ON (a.UserID = b.UserID) and (a.SymbolID = b.SymbolID) AND (a.Date = b.Date); QUIT; RUN;

%End;

%End;

%MEND mfmatch;

%mfmatch ;
Run;

Data Echo.FW_new_follows;
Set Play; run;




*****************************************************************************************************************
*****************************************************************************************************************

9. Now do the same analysis for Likes

*****************************************************************************************************************
*****************************************************************************************************************;

** First, prepare the Likes file ***;
Data Likes2;
Set ST.Likes_create;
Format Date2 Date8.;

Date2 = Date;

Drop Date Time ID;
run;

Data Likes3;
Set Temp1c;

MessageID = MessageID2;
SymbolID = SymbolID2;

Keep MessageID SymbolID Bull2 Bear2;
run;

Proc Sql;
Create Table Likes3 AS
SELECT *
FROM Likes2 a LEFT JOIN Likes3 b
ON (a.MessageID = b.MessageID)
WHERE (a.MessageID ne . and b.MessageID ne .)
;
QUIT; RUN;

Proc Sort Data=Likes3 nodupkey; by MessageID UserID LikedUserID; run;   *** I checked and there are mistake duplicates in the original Likes_create file, this fixes that ****;


*** Now Build the main dataset and merge **;

Data Main;
Set ST.USD_Panel;
Format Date_Begin Date8. Date_End Date8.;

Date_Begin = Date + 1;
Date_End = Date + 50;

run;

Proc printto log=junk; run;

%MACRO mfmatch;

%DO b=0 %TO 99;

Data Sub&b;
Set Main;
where mod(UserID, 100) = &b;
run;


Proc Sql;
Create Table Sub&b AS

SELECT *

FROM Sub&b a LEFT JOIN Likes3 b

ON (a.UserID = b.UserID) AND (a.SymbolID = b.SymbolID) AND (a.Date_Begin <= b.Date2) AND (a.Date_End >= b.Date2) 

WHERE (a.UserID ne . AND b.LikedUserID ne .)
;
QUIT; RUN;

%End;

Data Cumulative;
Set Sub0-Sub99; run;

%MEND mfmatch;

%mfmatch ;
Run;

Proc printto; run;


Data ET1C;
Set Cumulative;

Event_Time = Date2 - Date;

run;

PROC SQL;
Create Table ET2C AS
  SELECT Sum(Bull2) AS Sum_Bull_Likes, Sum(Bear2) AS Sum_Bear_Likes, Bull, Bear, Event_Time, UserID, SymbolID, Date
    FROM ET1C
      GROUP BY UserID, SymbolID, Date, Bull, Bear, Event_Time;
QUIT;
Run;

***************************************;
***** BUILD Regression Dataset ****;

Data Regression;
Set ST.USD_Panel;
Format Date2 Date8.;

Do i = 1 to 50;
Event_Time = i; 
Date2 = Date + i;
output;
end;

Drop i;
run;

Proc Sql;
Create Table Regression AS
SELECT *
FROM Regression a LEFT JOIN ET2C b
ON (a.UserID = b.UserID) AND (a.SymbolID = b.SymbolID) AND (a.Date = b.Date) AND
(a.Event_Time = b.Event_Time);
QUIT; RUN;

Data ST.Reg_Likes;
Set Regression;

If Sum_Bull_Likes = . then Sum_Bull_Likes = 0;
If Sum_Bear_Likes = . then Sum_Bear_Likes = 0;

If Date2 > MDY(6, 30, 2020) then delete;  **** Only keep event times that are possible ****;

run;

***************************************;
***** Corresponding Figure ****;

Proc Sort Data=ST.Reg_Likes;
by Bull Bear Event_Time; run;

Proc Univariate Data=ST.Reg_Likes noprint;
by Bull Bear Event_Time;
Var Sum_Bull_Likes Sum_Bear_Likes;
output out=ST.Figure5 sum=Sum_Bull_Likes Sum_Bear_Likes mean=Mean_Bull Mean_Bear N=N1 N2; run;


***************************************;
***** Build Final Regression Dataset with columns for ED 1, 2, ....10, 11-20, 21-30, 31-40, 41-50 ****;

%MACRO mfmatch;

%DO b=1 %TO 10;

Data Play&b ; Set ST.Reg_likes; 
If Event_Time = &b; Sum_Bull_Likes_&b = Sum_Bull_Likes; Sum_Bear_Likes_&b = Sum_Bear_Likes; 
Keep UserID SymbolID Date Sum_Bull_Likes_&b Sum_Bear_Likes_&b;
run;

%End;

%End;


%MEND mfmatch;

%mfmatch ;
Run;

Data Play11 ; Set ST.Reg_likes; 
If Event_Time >= 11 and Event_Time <= 20; Sum_Bull_Likes_11_20 = Sum_Bull_Likes; Sum_Bear_Likes_11_20 = Sum_Bear_Likes; 
Keep UserID SymbolID Date Sum_Bull_Likes_11_20 Sum_Bear_Likes_11_20;
run;

Proc Sort Data=Play11; by UserID SymbolID Date; run;
Proc Univariate Data=Play11 noprint; by UserID SymbolID Date;
Var Sum_Bull_Likes_11_20 Sum_Bear_Likes_11_20; output out=Play11 Sum=Sum_Bull_Likes_11_20 Sum_Bear_Likes_11_20; run;

Data Play12 ; Set ST.Reg_likes; 
If Event_Time >= 21 and Event_Time <= 30; Sum_Bull_Likes_21_30 = Sum_Bull_Likes; Sum_Bear_Likes_21_30 = Sum_Bear_Likes; 
Keep UserID SymbolID Date Sum_Bull_Likes_21_30 Sum_Bear_Likes_21_30;
run;

Proc Sort Data=Play12; by UserID SymbolID Date; run;
Proc Univariate Data=Play12 noprint; by UserID SymbolID Date;
Var Sum_Bull_Likes_21_30 Sum_Bear_Likes_21_30; output out=Play12 Sum=Sum_Bull_Likes_21_30 Sum_Bear_Likes_21_30; run;

Data Play13 ; Set ST.Reg_likes; 
If Event_Time >= 31 and Event_Time <= 40; Sum_Bull_Likes_31_40 = Sum_Bull_Likes; Sum_Bear_Likes_31_40 = Sum_Bear_Likes; 
Keep UserID SymbolID Date Sum_Bull_Likes_31_40 Sum_Bear_Likes_31_40;
run;

Proc Sort Data=Play13; by UserID SymbolID Date; run;
Proc Univariate Data=Play13 noprint; by UserID SymbolID Date;
Var Sum_Bull_Likes_31_40 Sum_Bear_Likes_31_40; output out=Play13 Sum=Sum_Bull_Likes_31_40 Sum_Bear_Likes_31_40; run;

Data Play14 ; Set ST.Reg_likes; 
If Event_Time >= 41 and Event_Time <= 50; Sum_Bull_Likes_41_50 = Sum_Bull_Likes; Sum_Bear_Likes_41_50 = Sum_Bear_Likes; 
Keep UserID SymbolID Date Sum_Bull_Likes_41_50 Sum_Bear_Likes_41_50;
run;

Proc Sort Data=Play14; by UserID SymbolID Date; run;
Proc Univariate Data=Play14 noprint; by UserID SymbolID Date;
Var Sum_Bull_Likes_41_50 Sum_Bear_Likes_41_50; output out=Play14 Sum=Sum_Bull_Likes_41_50 Sum_Bear_Likes_41_50; run;


**** Now put it altogether **;
Data Play; Set ST.Reg_likes; If Event_Time = 1;
Keep UserID SymbolID Date Bear Bull;
run;

%MACRO mfmatch;

%DO b=1 %TO 14;

Proc Sql;
Create Table Play AS SELECT *
FROM Play a LEFT JOIN Play&b b ON (a.UserID = b.UserID) and (a.SymbolID = b.SymbolID) AND (a.Date = b.Date); QUIT; RUN;

%End;

%End;

%MEND mfmatch;

%mfmatch ;
Run;

Data Echo.FW_likes;
Set Play; run;




*****************************************************************************************************************
*****************************************************************************************************************

10. Make summary stats table

*****************************************************************************************************************
*****************************************************************************************************************;

**** Users *****;

Data Users;
Set ST.USD_panel;

Keep UserID;
run;

Proc Sort Data=Users nodupkey; by UserID; run;

Data Part1;
Set ST.Master_small;

If trim(left(Approach)) = "null" then Approach = "";
If trim(left(Experience)) = "null" then Experience = "";

Keep UserID Approach Experience;
run;

Proc Sort Data=Part1 nodupkey; by UserID Date; run;
Proc Sort Data=Part1 nodupkey; by UserID; run;   **** Keep first instance of approach and experience;

Data Part1;
Set Part1;

Approach1 = Approach;
Experience1 = Experience;

Keep UserID Approach1 Experience1;
run;

Proc Sql;
Create Table Users2 AS
SELECT *
FROM Users a LEFT JOIN Part1 b
ON (a.UserID = b.UserID);
QUIT; RUN;

Proc Sort Data=Users2; by Experience1; run;

Proc Univariate Data=Users2 noprint;
by Experience1;
Var UserID;
output out=Users3 N=N; run;

Data Users3; Set Users3;
Format Title $36.;

If trim(left(experience1)) = "Novice" then Title = "B. Novice";
If trim(left(experience1)) = "Intermediate" then Title = "C. Intermediate";
If trim(left(experience1)) = "Professional" then Title = "D. Professional";
If trim(left(experience1)) = "" then Title = "E. Unclassified";

Total = N;
Keep Title Total;
run;

Proc Univariate Data=Users3 noprint;
Var Total;
output out=Users4 sum=N; run;

Data Users4; Set Users4;
Format Title $36.;
Title = "A. Users";
Total = N;
Keep Title Total;
run;

Data Users5; Set Users3 Users4; run;  Proc Sort Data=Users5; by Title; run;



**** Symbols *****;

Data Symbol1;
Set ST.USD_panel;

Keep SymbolID;
run;

Proc Sort Data=Symbol1 nodupkey; by SymbolID; run;

Data Symbol1B;
Set ST.Master_small;

If SymbolID ne .;

Keep SymbolID Symbol_Name Symbol_Ticker;
run;

Proc Sort Data=Symbol1B; by SymbolID Symbol_Name Symbol_Ticker; run;

Proc Univariate Data=Symbol1B noprint;
by SymbolID Symbol_Name Symbol_Ticker;
Var SymbolID;
output out=Symbol1B N=N; run;

Proc Sort Data=Symbol1B; by SymbolID DESCENDING N; run;

Data Symbol1B2;
Set Symbol1B;

Lag1SymbolID = lag1(SymbolID);
Lag1Symbol_Name = lag1(Symbol_Name);
Lag1Symbol_Ticker = lag1(Symbol_Ticker);
Lag1N = lag1(N);

If SymbolID = Lag1SymbolID;

run;

Proc Sort Data=Symbol1B; by SymbolID DESCENDING N; run;
Proc Sort Data=Symbol1B nodupkey; by SymbolID; run;   *** Keep only most popular by SymbolID;

Proc Sort Data=Symbol1B2; by SymbolID DESCENDING Lag1N; run;
Proc Sort Data=Symbol1B2 nodupkey; by SymbolID; run;   *** Keep only 2nd most popular by SymbolID;

Data Symbol1B2;
Set Symbol1B2;

Symbol_Name_2 = Symbol_Name;
Symbol_Ticker_2 = Symbol_Ticker;
N_2 = N;

Keep SymbolID Symbol_Name_2 Symbol_Ticker_2 N_2;
run;

Proc Sql;
Create Table Symbol1B AS
SELECT *
FROM Symbol1B a LEFT JOIN Symbol1B2 b
ON (a.SymbolID = b.SymbolID);
QUIT; RUN;

Proc Sql;
Create Table Symbol1 AS
SELECT *
FROM Symbol1 a LEFT JOIN Symbol1B b
ON (a.SymbolID = b.SymbolID)
Where (a.SymbolID ne .)
;
QUIT; RUN;

**** Save Our List of SymbolIDs with Name AND Ticker *****;
Data ST.Our_Symbols;
Set Symbol1;
run;



**** Merge with our SymbolID and Permno Link File *****;
Proc Sql;
Create Table Symbol1 AS
SELECT *
FROM Symbol1 a LEFT JOIN Echo.Permno_symbol_match_final b
ON (a.SymbolID = b.SymbolID);
QUIT; RUN;

Data Symbol2;
Set Symbol1;

Match_CRSP = 0;
If permno ne . then Match_CRSP = 1;

Keep SymbolID Match_CRSP;
run;


Proc Sort Data=Symbol2; by Match_CRSP; run;

Proc Univariate Data=Symbol2 noprint;
by Match_CRSP;
Var SymbolID;
output out=Symbol3 N=N; run;

Data Symbol3; Set Symbol3;
Format Title $36.;

If Match_CRSP = 1 then Title = "G. CRSP";
If Match_CRSP = 0 then Title = "H. Non-CRSP";

Total = N;
Keep Title Total;
run;

Proc Univariate Data=Symbol3 noprint;
Var Total;
output out=Symbol4 sum=N; run;

Data Symbol4; Set Symbol4;
Format Title $36.;
Title = "F. Symbols";
Total = N;
Keep Title Total;
run;

Data Symbol5; Set Symbol3 Symbol4; run;  Proc Sort Data=Symbol5; by Title; run;


**** Sentiment Messages *****;

Data Messages1;
Set Temp1c;
If Bull2 = . then Bull2 = 0;  If Bear2 = . then Bear2 = 0;
Sent_Messages = Bull2 + Bear2;
Keep Bull2 Bear2 Sent_Messages;
run;

Proc Univariate Data=Messages1 noprint;
Var Bull2;
output out=Messages2 Sum=N; run;

Data Messages2; Set Messages2;
Format Title $36.;
Title = "J. Bullish";
Total = N;
Keep Title Total;
run;

Proc Univariate Data=Messages1 noprint;
Var Bear2;
output out=Messages3 Sum=N; run;

Data Messages3; Set Messages3;
Format Title $36.;
Title = "K. Bearish";
Total = N;
Keep Title Total;
run;

Proc Univariate Data=Messages1 noprint;
Var Sent_Messages;
output out=Messages4 Sum=N; run;

Data Messages4; Set Messages4;
Format Title $36.;
Title = "I. Sentiment Messages";
Total = N;
Keep Title Total;
run;

Data Messages5; Set Messages2 Messages3 Messages4; run;

Proc Sort Data=Messages5; by Title; run;


**** Days *****;

Data Days1;
Set ST.USD_Panel;
Format ST_Date date8.;
ST_Date = Date;
Keep ST_Date;
run;

Proc Sort Data=Days1 nodupkey; by ST_Date; run;

Data Days2;
Set Echo.Main_crsp Echo.Crsp2020jan_jun;   ***** Daily is the Daily CRSP File *****;
Format CRSP_Date date8.;
CRSP_Date = Date;
Keep CRSP_Date;
run;

Proc Sort Data=Days2; by CRSP_Date; run;

Proc Univariate Data=Days2 noprint;
by CRSP_Date;
Var CRSP_Date;
output out=Days2 N=N; run;

Data Days2;
Set Days2;

If N > 1000;

run;

Proc Sql;
Create Table Days3 AS
SELECT *
FROM Days1 a LEFT JOIN Days2 b
ON (a.ST_Date = b.CRSP_Date)
Where (a.ST_Date ne .);
QUIT; RUN;

Data Days3; set Days3; Trade_Day = 0; If N ne . then Trade_Day = 1; run; 

Proc Sort Data=Days3; by Trade_Day; run;

Proc Univariate Data=Days3 noprint;
by Trade_Day;
Var ST_Date;
output out=Days4 N=N; run;

Data Days4; Set Days4;
Format Title $36.;

If Trade_Day = 1 then Title = "M. Trading Days";
If Trade_Day = 0 then Title = "N. Non-Trading Days";

Total = N;
Keep Title Total;
run;

Proc Univariate Data=Days4 noprint;
Var Total;
output out=Days5 sum=N; run;

Data Days5; Set Days5;
Format Title $36.;
Title = "L. Days";
Total = N;
Keep Title Total;
run;

Data Days5; Set Days4 Days5; run;  Proc Sort Data=Days5; by Title; run;


**** User-Symbol-Sentiment Days *****;

Data USSD1;
Set ST.USD_Panel;
run;

Proc Univariate Data=USSD1 noprint;
Var SymbolID;
output out=USSD1 N=N; run;

Data USSD5; Set USSD1;
Format Title $36.;
Title = "O. User-Symbol-Sentiment Days";
Total = N;
Keep Title Total;
run;

**** Putting it Altogether *****;

Data ST.Summary_Table_PanelA; Set Users5 Symbol5 Messages5 Days5 USSD5; run;



* Summary Stats Table - Second Panel *************************;

Data FT;
Set ST.USD_Panel;

Final_Bull = Bull;
Final_Bear = Bear;

If Final_Bull = . then Final_Bull = 0;
If Final_Bear = . then Final_Bear = 0;
Keep UserID SymbolID Date Final_Bull Final_Bear;
run;

Proc Sort Data=FT; by UserID Date; run;

Proc Univariate Data=FT noprint;
by UserID Date;
Var Final_Bull Final_Bear;
output out=FT2 sum=Final_Bull Final_Bear; run;

Data FT2; Set FT2; 
Sum = Final_Bull+Final_Bear;
If Sum in (2, 3, 4);
run;

Proc Sort Data=FT2; by Sum DESCENDING Final_Bull Final_Bear; run;

Proc Univariate Data=FT2 noprint;
by Sum DESCENDING Final_Bull Final_Bear;
Var UserID;
output out=FT3 N=N; run;

Proc Sort Data=FT3; by Sum; run;

Proc Univariate Data=FT3 noprint;
by Sum;
Var N;
output out=FT4 sum=Total; run;

Proc Sql;
Create Table FT5 AS
SELECT *
FROM FT3 a LEFT JOIN FT4 b
ON (a.Sum = b.Sum);
QUIT; RUN;

Proc Sort Data=FT2; by Sum; run;

Proc Univariate Data=FT2 noprint;
by Sum;
Var Final_Bull Final_Bear;
output out=FT6 Sum=Total_Bull Total_Bear; run;

Proc Sql;
Create Table FT7 AS
SELECT *
FROM FT5 a LEFT JOIN FT6 b
ON (a.Sum = b.Sum);
QUIT; RUN;

Data FT8;
Set FT7;

p = Total_Bull/ (Total_Bull+Total_Bear);

Prob = PDF ('BINOMIAL', Final_Bull, p, sum);

Freq = N / Total;

Keep Final_Bull Final_Bear Prob Freq;
run;

Data ST.Summary_Table_PanelB;
Set FT8;
run;


*****************************************************************************************************************
*****************************************************************************************************************

11. Delete excess datasets

*****************************************************************************************************************
*****************************************************************************************************************;

proc delete data=Sub0-Sub99;
run;

proc delete data=play1-play14;
run;

proc delete data=Cumulative play ET1 ET1C ET2 ET2C Regression Main Main1 Main2 Main3 Main4 Main5 Temp1 Temp4 Temp1c Likes2 Likes3;
run;



*****************************************************************************************************************
*****************************************************************************************************************

12. Build "Information Silo" Dataset and make the regression dataset for Table 9

*****************************************************************************************************************
*****************************************************************************************************************;

*** Transform Bull, Bear Sent to -1 and +1 a la Cookson and Neissner (2020);
Data Main1;
Set ST.USD_Panel;

If Bull = 1 then Sent = 1;
If Bear = 1 then Sent = -1;

Drop Bull Bear;
run;

*** Prepare following dataset, calls users sheep;
Data Main2;
Set ST.Following;

SheepID = UserID;

Drop UserID ID;
run;



*** Now see on a given data what users are exposed to given they see at least two messages (can't calc 
Std of sentiment received within user-symbol without at least two ***;

Proc printto log=junk; run;

%MACRO mfmatch;

%DO b=0 %TO 99;

Data Sub&b;
Set Main2;
where mod(SheepID, 100) = &b;
run;


Proc Sql;
Create Table Sub&b AS

SELECT *

FROM Main1 a LEFT JOIN Sub&b b

ON (b.Date_Created < a.Date) AND (b.Date_Destroyed > a.Date) AND (a.UserID = b.FollowingUserID)

WHERE (a.UserID ne . AND b.Date_Created ne .)
;
QUIT; RUN;



PROC SQL;
Create Table Sub&b AS
  SELECT Avg(Sent) AS Av_Sent, Std(Sent) AS Std_Sent, N(Sent) AS Num_Sent, Date, SymbolID, SheepID
    FROM Sub&b
      GROUP BY Date, SymbolID, SheepID;
QUIT;
Run;


Data Sub&b; Set Sub&b; Where Std_Sent ne .; run;

%End;

Data Cumulative;
Set Sub0-Sub99; run;

%MEND mfmatch;

%mfmatch ;
Run;

Proc printto; run;

Proc Sort Data=Cumulative; by SymbolID Date; run;

Proc Univariate Data=Cumulative noprint;
by SymbolID Date;
Var Av_Sent Std_Sent;
output out=Regression mean=mean1 mean2 std=std1 std2 N=N1 N2; run;



**** Now add overall messages, std and avg sentiment from Main1 ****;

Proc Sort Data=Main1; by SymbolID Date; run;

Proc Univariate Data=Main1 noprint;
by SymbolID Date;
Var Sent;
output out=Add mean=mean std=std N=N; run;

Data Add;
Set Add;

Std_Orig_Signals = Std;

Av_Sent_Orig_Signals = mean;

Num_Orig_Signals = N;

Keep Date SymbolID Num_Orig_Signals Std_Orig_Signals Av_Sent_Orig_Signals;
run;

Proc Sql;
Create Table Regression AS

SELECT *

FROM Regression a LEFT JOIN Add b

ON (a.SymbolID = b.SymbolID) AND (a.Date = b.Date)

;
QUIT; RUN;

*** Make regression dataset for Table 9 *****;
Data Regression;
Set Regression;

Std_Rec_Mean_Signal = std1;

Mean_Rec_Signal_Std = mean2;

Num_Receivers = N2;

Keep SymbolID Date Std_Orig_Signals Std_Rec_Mean_Signal Mean_Rec_Signal_Std Num_Orig_Signals Av_Sent_Orig_Signals Num_Receivers;
run;

Data Echo.Fw_Info_Silo;
Set Regression;

run;


*****************************************************************************************************************
*****************************************************************************************************************

13. Now do Combinatorics to see likelihood of receiving all the same message and make regression dataset 
    relating this to declared bull/bear

*****************************************************************************************************************
*****************************************************************************************************************;

Data Prob1;
Set Cumulative;

Num_Chosen = Num_Sent;
Num_Bull = Num_Sent - (Num_Sent-Av_Sent*Num_Sent)/2;

Drop Num_Sent;
run;

Data Prob2;
Set Add;

Num_All = Num_Orig_Signals;

Num_Bull_All = Num_All - (Num_All - Av_Sent_Orig_Signals*Num_All)/2;

Keep SymbolID Date Num_All Num_Bull_All;
run;

Proc Sql;
Create Table Prob1 AS
SELECT *
FROM Prob1 a LEFT JOIN Prob2 b
ON (a.SymbolID = b.SymbolID) AND (a.Date = b.Date)
WHERE (a.Num_Chosen ne .) AND (b.Num_All ne .);
QUIT; RUN;

Data Prob1;
Set Prob1;

Part1 = comb(Num_Bull_All,Num_Chosen);
Part2 = comb(Num_All - Num_Bull_All,Num_Chosen);
Part3 = comb(Num_All,Num_Chosen);

If Part1 = . then Part1 = 0;
If Part2 = . then Part2 = 0;

Prob_Bull_Same = Part1/Part3;
Prob_Bear_Same = Part2/Part3;

Prob_All_Same = (Part1 + Part2)/Part3;

Drop Part1 Part2 Part3;
run;

Data Prob2;
Set Prob1;

All_Same = 0;
If Std_Sent = 0 then All_Same = 1;

Round_Prob = (Floor(Prob_All_Same*20))/20;

If Prob_All_Same = 1 or Prob_All_Same = 0 then delete;

run;

**** Figure 6 for the paper ****;

PROC SQL;
Create Table ST.Figure6 AS
  SELECT Avg(All_Same) AS All_Same_Freq, N(All_Same) AS N_Obs, Round_Prob
    FROM Prob2
      GROUP BY Round_Prob;
QUIT;
Run;

******** Now look back over last week (7 days) and relate to declared bull/bear for regression *********;

Proc Sql;
Create Table EC1 AS
SELECT *
FROM Prob1 a LEFT JOIN Temp1b b
ON (a.SymbolID = b.SymbolId2) AND (a.SheepID = b.UserID2) AND (a.Date > b.Date2) AND (a.Date <= (b.Date2 + 7)) AND (a.SymbolID = b.SymbolId2)
WHERE (a.SheepID ne .) and (b.Date2 ne .)
;
QUIT; RUN;

*** If more than one declaration, keep ,ost recent ****;
Proc Sort Data=EC1; by SymbolID Date SheepID DESCENDING Date2; run;
Proc Sort Data=EC1 nodupkey; by SymbolID Date SheepID; run;

Data Echo.Fw_Combinatorics;
Set EC1;

If Date2 = . then delete;

If Bull2 = . then Bull2 = 0;
If Bear2 = . then Bear2 = 0;

All_Bull = 0;
If Num_Chosen = Num_Bull then All_Bull = 1;

All_Bear = 0;
If Num_Bull = 0 then All_Bear = 1;

If Prob_Bull_Same = 0 or Prob_Bear_Same = 0 then delete;

run;


*****************************************************************************************************************
*****************************************************************************************************************

14. Another round of deleting excess datasets

*****************************************************************************************************************
*****************************************************************************************************************;








*****************************************************************************************************************
*****************************************************************************************************************

15. Make messages file to be classified as buy/sell trades

*****************************************************************************************************************
*****************************************************************************************************************;

Data Messages_TBC;
set ST.Master;

Keep Message MessageID;
run;

PROC EXPORT DATA= Messages_TBC
            OUTFILE= "C:\Users\jengelberg\Dropbox\Echo\Messages_TBC.csv" 
            DBMS=CSV REPLACE;
     PUTNAMES=YES;
RUN;


***** Received back, now read in *****;

PROC IMPORT OUT= Received
            DATAFILE= "C:\Users\jengelberg\Dropbox\Echo\stocktwits_results.csv" 
            DBMS=CSV REPLACE;
     GETNAMES=YES;
     DATAROW=2; 
RUN;

Data ST.Trading_Classified_Messages;
Set Received;

Drop body; run;




*****************************************************************************************************************
*****************************************************************************************************************

16. Create rolling newsfeed - basically the same as step 6 except looking BACK rather than forward

*****************************************************************************************************************
*****************************************************************************************************************;

**** Look back over previous 30 days ****;

Data Main;
Set ST.USD_Panel;
Format Date_Begin Date8. Date_End Date8.;

Date_Begin = Date - 30;
Date_End = Date - 1;

run;

Proc printto log=junk; run;

%MACRO mfmatch;

%DO b=0 %TO 99;

Data Sub&b;
Set Main;
where mod(UserID, 100) = &b;
run;


Proc Sql;
Create Table Sub&b AS

SELECT a.Date, a.SymbolID, a.UserID, c.UserID2, a.Bull, a.Bear, c.Bear2, c.Bull2, c.Date2

FROM Sub&b a LEFT JOIN ST.Following b

ON (a.UserID = b.UserID)

LEFT JOIN Temp1B c

ON (b.FollowingUserID = c.UserId2) AND (a.SymbolID = c.SymbolID2)

AND (a.Date_Begin <= c.Date2) AND (a.Date_End >= c.Date2)

AND (c.Date2 > b.Date_Created) AND (c.Date2 < b.Date_Destroyed) 

WHERE (c.Bull2 = 1 OR c.Bear2 = 1)
;
QUIT; RUN;



%End;

Data Cumulative;
Set Sub0-Sub99; run;

%End;


%MEND mfmatch;

%mfmatch ;
Run;

Proc printto; run;


Data LB_30;
Set Cumulative;

run;

PROC SQL;
Create Table Echo.FW_rolling_newsfeed AS
  SELECT Sum(Bull2) AS Sum_Bull, Sum(Bear2) AS Sum_Bear, Bull, Bear, UserID, SymbolID, Date
    FROM LB_30
      GROUP BY UserID, SymbolID, Date, Bull, Bear;
QUIT;
Run;






*****************************************************************************************************************
*****************************************************************************************************************

17. Create File Which Measures User Characteristics: Follows, Following, Experience, Total Activity

*****************************************************************************************************************
*****************************************************************************************************************;

***** Count total posts, likes, follows for each user ****;

Data ActiveUsers;
Set ST.Usd_panel ST.Likes_Create ST.Friends_Create;

If Date >= MDY(1, 1, 2013);

Keep UserID Date; run;

Proc Sort Data=ActiveUsers; by UserID; run;

Proc Univariate Data=ActiveUsers noprint;
by UserID;
Var Date;
output out=ActiveUsers2 N=N; run;

Data Echo.Fw_UserActivity;
Set ActiveUsers2;
run;


**** Add Follows Following and Experience.  First get last observation per user symbol day *****;

Data Chars;
set ST.Master;

Keep UserID SymbolID Date Time Approach Experience FollowersCount FollowingCount;
run;

Proc Sort Data=Chars; by UserID SymbolID Date DESCENDING Time; run;
Proc Sort Data=Chars nodupkey; by UserID SymbolID Date; run;

Data Chars;
Set Chars;

Keep UserID SymbolID Date Approach Experience FollowersCount FollowingCount;
run;

Proc Sql;
Create Table Chars2 AS
SELECT *
FROM ST.USD_panel a LEFT JOIN Chars b
ON (a.UserID = b.UserID) AND (a.SymbolID = b.SymbolId) AND (a.Date = b.Date)
;
QUIT; RUN;

Proc Sql;
Create Table Chars2 AS
SELECT *
FROM Chars2 a LEFT JOIN ActiveUsers2 b
ON (a.UserID = b.UserID)
;
QUIT; RUN;

Data Chars2;
Set Chars2;

Total_Activity_Over_Sample = N;

Keep UserID SymbolID Date Approach Experience FollowersCount FollowingCount Total_Activity_Over_Sample;
run;

Data Echo.FW_Chars;
Set Chars2;
run;




*****************************************************************************************************************
*****************************************************************************************************************

18. Dataset with just MessageID UserID Date SymbolID

*****************************************************************************************************************
*****************************************************************************************************************;

Data Echo.Fw_Trade_Add_On;
Set ST.Master_Small;

Keep MessageID UserID Date SymbolID;
run;




*****************************************************************************************************************
*****************************************************************************************************************

19. Create Figure 1, which is sentiment persistence

*****************************************************************************************************************
*****************************************************************************************************************;

Data PS1;
Set ST.Usd_panel;
Format Date_Begin Date8. Date_End Date8.;

Date_Begin = Date + 1;
Date_End = Date + 50;

run;

Data Temp1B;
Set ST.Usd_panel;
Format Date2 Date8.;

UserID2 = UserID;
Date2 = Date;
Bear2 = Bear;
Bull2 = Bull;
SymbolID2 = SymbolID;

Keep Date2 UserID2 Bear2 Bull2 SymbolID2;
run;

Proc Sql;
Create Table PS1 AS

SELECT a.Date, a.SymbolID, a.UserID, a.Bull, a.Bear, b.Bear2, b.Bull2, b.Date2 

FROM PS1 a LEFT JOIN Temp1b b

ON (a.UserID = b.UserID2) AND (a.SymbolID = b.SymbolID2) AND (a.Date_Begin <= b.Date2) AND (a.Date_End >= b.Date2)

WHERE (a.Date ne . AND b.Date2 ne .)
;
QUIT; RUN;

Data PS1;
Set PS1;

Event_Time = Date2 - Date;

run;

PROC SQL;
Create Table PS2 AS
  SELECT Sum(Bull2) AS Sum_Bull, Sum(Bear2) AS Sum_Bear, Event_Time, Bull, Bear
    FROM PS1
      GROUP BY Bull, Bear, Event_Time;
QUIT;
Run;

Proc Univariate Data=ST.Usd_panel noprint;
Var Bull Bear;
output out=PS3 Sum=Bull_Sum Bear_Sum; run;

Data PS3; Set PS3;
Do i = 1 to 2; output; end; run;

Data PS3;
Set PS3;

If i = 1 then do;

Bull = 1;
Unconditional = Bull_Sum/ (Bull_Sum + Bear_Sum);

end;

If i = 2 then do;

Bear = 1;
Unconditional = Bear_Sum/ (Bull_Sum + Bear_Sum);

end;

Keep Bull Bear Unconditional;
run;

Proc Sql;
Create Table PS4 AS

SELECT *

FROM PS2 a LEFT JOIN PS3 b

ON (a.Bull = b.Bull) AND (a.Bear = b.Bear)
;
QUIT; RUN;





*****************************************************************************************************************
*****************************************************************************************************************

20. Find the Messages with URLs, Count the Number per User

*****************************************************************************************************************
*****************************************************************************************************************;

filename FT77F004 "E:\StockTwits\Legacy\stocktwits_legacy_messages*";

Data Link1;
INFILE FT77F004 dsd DLM='*' dsd lrecl=12336;
informat var $12336.;
input var $ @@;

Format UserID 12. Message $1200. FollowersCount 12. FollowingCount 12. FollowingStocksCount 12. StatusesCount 12.
	   Approach $14. Experience $14.
	   MessageID 12. SymbolID 12. SymbolText $412. Num_Symbols 12. Symbol_Name $36. Symbol_Ticker $8.
       Bear 12. Bull 12. Date Date8. Time Time8. Part1 $412. Text1 $12336.  Text2 $12336.;

If index(var, '{"verb":"post","id":"tag:firehose.stocktwits.com:note/') ne 0 and 
index(var, '{"id":"person:stocktwits:') ne 0 and
index(var, '{"id":"note:stocktwits:') ne 0 and
index(var, ',"postedTime":"') ne 0 

/*
and index(var, '"symbols":[{"displayName":') ne 0 
and index(var, '"sentiment":{"basic":"') ne 0
*/

then do;

Num1 = index(var, '{"verb":"post","id":"tag:firehose.stocktwits.com:note/');
Num2 = index(var, '{"id":"person:stocktwits:');
Num3 = index(var, '{"id":"note:stocktwits:');
Num4 = index(var, ',"postedTime":"');
Num5 = index(var, '"symbols":[{"displayName":');
Num6 = index(var, '"sentiment":{"basic":"');

Num7 = index(var, '","body":"') + 10;
Num8 = index(var, '","actor":{') - Num7;

Num9 = Num5 + 26;
Num10 = Num6 - Num9;



MessageID = Input( scan( substr(var, Num1), 9, '/"'), 12.);

UserID = Input( scan( substr(var, Num2), 5, ':"'), 12.);

Text1 = substr(var, Num2);

Num2a = index(Text1, '"followersCount":');
Num2b = index(Text1, '"followingCount":');
Num2c = index(Text1, 'followingStocksCount":');
Num2d = index(Text1, '"statusesCount":');
Num2e = index(Text1, '"approach":');
Num2f = index(Text1, '"experience":');

FollowersCount = Input( scan( substr(Text1, Num2a), 2, ':",'), 12.);
FollowingCount = Input( scan( substr(Text1, Num2b), 2, ':",'), 12.);
FollowingStocksCount = Input( scan( substr(Text1, Num2c), 2, ':",'), 12.);
StatusesCount = Input( scan( substr(Text1, Num2d), 2, ':",'), 12.);
Approach = compress( trim(left(scan( substr(Text1, Num2e), 2, ':,'))), '"}');
Experience = compress( trim(left(scan( substr(Text1, Num2f), 2, ':,'))), '"}');

Message = trim(left(substr(var, Num7, Num8)));

MessageID2 = Input( scan( substr(var, Num3), 5, ':"'), 12.);

Part1 = compress( scan( substr(var, Num4), 4, '"'), '"');

Year = Input( scan(Part1, 1, '-TZ:+'), 12.);
Month = Input( scan(Part1, 2, '-TZ:+'), 12.);
Day = Input( scan(Part1, 3, '-TZ:+'), 12.);

Hour = Input( scan(Part1, 4, '-TZ:+'), 12.);
Minute = Input( scan(Part1, 5, '-TZ:+'), 12.);
Second = Input( scan(Part1, 6, '-TZ:+'), 12.);

Date = MDY(Month, Day, Year);

Time = HMS(Hour, Minute, Second);


Text2 = substr(var, Num5);

Num5a = index(Text2, '"stocktwits_id":');

SymbolID = Input( scan( substr(Text2, Num5a), 2, ':",'), 12.);

SymbolText = trim(left( substr(var, Num9, Num10)  ));

Symbol_Name = trim(left( compress( scan(SymbolText, 1, ','), '"') ));


Num11 = index(SymbolText, ',"symbol":') + 10;
Text3 = substr(SymbolText, Num11);

Symbol_Ticker = trim(left( compress( scan(Text3, 1, ','), '"') ));

Num_Symbols = Count(SymbolText, '"stocktwits_id":');

Sent = scan( substr(var, Num6), 4, '":,');

If trim(left(Sent)) = "Bearish" then Bear = 1;
If trim(left(Sent)) = "Bullish" then Bull = 1;


If index( Upcase(Message), "HTTP") ne 0 then do; output; end;

end;




Keep  Message UserID MessageID Date;
run;

Data Link1;
Set Link1;

Format Word $600.;

Words = CountW(Message, " ");

Do i = 1 to Words;

Word = trim(left(scan(Message, i, " ")));

If index(Upcase(Word), "HTTP") ne 0 then do;  output; end;

end;

Keep UserID MessageID Date Word;
run;


Data Link1;
Set Link1;
Format Word_Final $250.;

Word = TranWRD(Word, "\n", " ");

Words = CountW(Word, " ");

Do i = 1 to Words;

Word_Final = trim(left(scan(Word, i, " ")));

If index(Upcase(Word_Final), "HTTP") ne 0 then do;  output; end;

end;

Keep UserID MessageID Date Word_Final;

run;

Data Link1;
Set Link1;
Format Link $250.;

Mark = index( Upcase(Word_Final),"HTTP://");

Link = substr(Word_Final, Mark);

If Date >= MDY(1, 1, 2013);  

Keep UserID MessageID Date Link;
run;

Proc Sort Data=Link1; by UserID; run;

Proc Univariate Data=Link1 noprint;
by UserID;
Var Date;
output out=Link2 N=Total_Links; run;


Data Link1_MC; **** Count Number of Messages ****;
Set St.Master_small;

If Date >= MDY(1, 1, 2013);

Keep UserID MessageID Date;
run;

Proc Sort Data=Link1_MC; by UserID; run;

Proc Univariate Data=Link1_MC noprint;
by UserID;
Var MessageID;
output out=Link1_MC2 N=Total_Messages; run;


Proc Sql;  *** Merge it all together ***;
Create Table Link3 AS
SELECT *
FROM Link1_mc2 a LEFT JOIN Link2 b
ON (a.UserID = b.UserID);
QUIT; RUN;


Data Echo.Credible_Links;
set Link3;

If Total_Links = . then Total_Links = 0;

run;



